suppressPackageStartupMessages({
library(Seurat)
library(rafalib)
library(cowplot)
library(plotly)
options(rgl.printRglwidget = TRUE)
library(Matrix)
library(sparseMatrixStats)
library(slingshot)
library(tradeSeq)
})
# Define some color palette
pal <- c((scales::hue_pal())(8), RColorBrewer::brewer.pal(9, "Set1"), RColorBrewer::brewer.pal(8,
"Set2"))
set.seed(1)
pal <- rep(sample(pal, length(pal)), 200)Nice function to easily draw a graph:
# Add graph to the base R graphics plot
draw_graph <- function(layout, graph, lwd = 0.2, col = "grey") {
res <- rep(x = 1:(length(graph@p) - 1), times = (graph@p[-1] - graph@p[-length(graph@p)]))
segments(x0 = layout[graph@i + 1, 1], x1 = layout[res, 1], y0 = layout[graph@i +
1, 2], y1 = layout[res, 2], lwd = lwd, col = col)
}In order to speed up the computations during the exercises, we will be using a subset of a bone marrow dataset (originally containing about 100K cells). The bone marrow is the source of adult immune cells, and contains virtually all differentiation stages of cell from the immune system which later circulate in the blood to all other organs.
You can download the files we prepared with these commands:
webpath <- "https://github.com/NBISweden/workshop-scRNAseq/blob/master/labs/data/bone_marrow/"
file_list <- c("trajectory_seurat_filtered.rds")
for (i in file_list) {
download.file(url = paste0(webpath, i, "?raw=true"), destfile = paste0(i))
}If you have been using the scran/scater ,
Seurat or Scanpy pipelines with your own data,
you need to reach to the point where can find get:
We already have pre-computed and subseted the dataset (with 6688 cells and 3585 genes) following the analysis steps in this course. We then saved the objects, so you can use common tools to open and start to work with them (either in R or Python).
obj <- readRDS("trajectory_seurat_filtered.rds")
# Calculate cluster centroids (for plotting the labels later)
mm <- sparse.model.matrix(~0 + factor(obj$clusters_use))
colnames(mm) <- levels(factor(obj$clusters_use))
centroids2d <- as.matrix(t(t(obj@reductions$umap@cell.embeddings) %*% mm)/Matrix::colSums(mm))Lets visualize which clusters we have in our dataset:
vars <- c("batches", "dataset", "clusters_use", "Phase")
pl <- list()
for (i in vars) {
pl[[i]] <- DimPlot(obj, group.by = i, label = T) + theme_void() + NoLegend()
}
plot_grid(plotlist = pl)You can check, for example how many cells are in each cluster:
table(obj$clusters)##
## 1 2 5 6 7 8 9 11 12 13 14 15 16 17 18 19 20 21 22 23
## 128 71 90 160 147 120 160 130 132 78 90 150 140 76 141 90 98 149 90 10
## 25 26 27 28 29 32 33 34 35 36 37 38 41 43 44 45 46 47 49 50
## 56 154 98 76 125 150 150 146 150 148 135 128 145 134 110 149 140 113 132 85
## 52 53 54 55 57 58 59 60 61
## 126 129 57 129 147 127 118 120 101
It is crucial that you performing analysis of a dataset understands what is going on, what are the clusters you see in your data and most importantly How are the clusters related to each other?. Well, let’s explore the data a bit. With the help of this table, write down which cluster numbers in your dataset express these key markers.
| Marker | Cell Type |
|---|---|
| Cd34 | HSC progenitor |
| Ms4a1 | B cell lineage |
| Cd3e | T cell lineage |
| Ltf | Granulocyte lineage |
| Cst3 | Monocyte lineage |
| Mcpt8 | Mast Cell lineage |
| Alas2 | RBC lineage |
| Siglech | Dendritic cell lineage |
| C1qc | Macrophage cell lineage |
| Pf4 | Megakaryocyte cell lineage |
vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech", "C1qc",
"Pf4")
pl <- list()
pl <- list(DimPlot(obj, group.by = "clusters_use", label = T) + theme_void() + NoLegend())
for (i in vars) {
pl[[i]] <- FeaturePlot(obj, features = i, order = T) + theme_void() + NoLegend()
}
plot_grid(plotlist = pl)Another way to better explore your data is look in higher dimensions, to really get a sense for what is right or wrong. As mentioned in the dimensionality reduction exercises, here we ran UMAP with 3 dimensions (IMPORTANT: the UMAP needs to be computed to results in exactly 3 dimensions).
Since the steps below are identical to both Seurat and
Scran pipelines, we ill extract the matrices from both, so
it is clear what is being used where and to remove long lines of code
used to get those matrices. We will use them all. Plot in 3D with
Plotly:
df <- data.frame(obj@reductions$umap3d@cell.embeddings, variable = factor(obj$clusters_use))
colnames(df)[1:3] <- c("UMAP_1", "UMAP_2", "UMAP_3")
p_State <- plot_ly(df, x = ~UMAP_1, y = ~UMAP_2, z = ~UMAP_3, color = ~variable,
colors = pal, size = 0.5)
try(htmlwidgets::saveWidget(p_State, selfcontained = T, "umap_3d_clustering_plotly.html"),
silent = T)
browseURL("umap_3d_clustering_plotly.html")
p_StateWe can now compute the lineages on these dataset.
# Define lineage ends
ENDS <- c("17","27","25","16","26","53","49")
set.seed(1)
lineages <- as.SlingshotDataSet(getLineages(
data = obj@reductions$umap3d@cell.embeddings,
clusterLabels = obj$clusters_use,
dist.method = "mnn", # It can be: "simple", "scaled.full", "scaled.diag", "slingshot" or "mnn"
end.clus = ENDS, # You can also define the ENDS!
start.clus = "34")) # define where to START the trajectories
# IF NEEDED, ONE CAN ALSO MANULALLY EDIT THE LINEAGES, FOR EXAMPLE:
# sel <- sapply( lineages@lineages, function(x){rev(x)[1]} ) %in% ENDS
# lineages@lineages <- lineages@lineages[ sel ]
# names(lineages@lineages) <- paste0("Lineage",1:length(lineages@lineages))
# lineages
# Change the reduction to our "fixed" UMAP2d (FOR VISUALISATION ONLY)
lineages@reducedDim <- obj@reductions$umap@cell.embeddings
mypar() ; plot(obj@reductions$umap@cell.embeddings, col = pal[obj$clusters_use], cex=.5,pch = 16)
lines(lineages, lwd = 1, col = 'black', cex=2 )
text(centroids2d, labels = rownames(centroids2d),cex=0.8,font=2,col = "white")Much better!
Once the clusters are connected, Slingshot allows you to transform them to a smooth trajectory using principal curves. This is an algorithm that iteratively changes an initial curve to better match the data points. It was developed for linear data. To apply it to single-cell data, slingshot adds two enhancements:
Since the function getCurves() takes some time to run,
we can speed up the convergence of the curve fitting process by reducing
the amount of cells to use in each lineage. Ideally you could all cells,
but here we had set approx_points to 300 to speed up. Feel
free to adjust that for your dataset.
# Define curves
curves <- as.SlingshotDataSet(getCurves(data = lineages, thresh = 0.1, stretch = 0.1,
allow.breaks = F, approx_points = 100))
curves## class: SlingshotDataSet
##
## Samples Dimensions
## 5828 2
##
## lineages: 7
## Lineage1: 34 18 36 33 55 59 44 60 58 29 8 43 47 49
## Lineage2: 34 18 11 15 46 9 1 2 5 13 28 17
## Lineage3: 34 18 11 15 35 7 32 6 54 25
## Lineage4: 34 18 11 15 35 7 32 6 27
## Lineage5: 34 18 36 21 12 20 16
## Lineage6: 34 18 36 33 55 38 53
## Lineage7: 34 18 36 26
##
## curves: 7
## Curve1: Length: 6.7241 Samples: 2161.05
## Curve2: Length: 7.3487 Samples: 2097.02
## Curve3: Length: 3.5349 Samples: 1502.25
## Curve4: Length: 2.5623 Samples: 1387.66
## Curve5: Length: 2.9268 Samples: 979.78
## Curve6: Length: 2.8976 Samples: 1086.34
## Curve7: Length: 2.1323 Samples: 644.86
# Plots
plot(obj@reductions$umap@cell.embeddings, col = pal[obj$clusters_use], pch = 16)
lines(curves, lwd = 2, col = "black")
text(centroids2d, labels = rownames(centroids2d), cex = 1, font = 2)With those results in hands, we can now compute the differentiation pseudotime.
pseudotime <- slingPseudotime(curves, na = FALSE)
cellWeights <- slingCurveWeights(curves)
x <- rowMeans(pseudotime)
x <- x/max(x)
o <- order(x)
mypar()
plot(obj@reductions$umap@cell.embeddings[o, ], main = paste0("pseudotime"), pch = 16,
cex = 0.4, axes = F, xlab = "", ylab = "", col = colorRampPalette(c("grey70",
"orange3", "firebrick", "purple4"))(99)[x[o] * 98 + 1])
points(centroids2d, cex = 2.5, pch = 16, col = "#FFFFFF99")
text(centroids2d, labels = rownames(centroids2d), cex = 1, font = 2)IMPORTANT: The pseudotime represents the distance of every cell to the starting cluster!
The main way to interpret a trajectory is to find genes that change along the trajectory. There are many ways to define differential expression along a trajectory:
tradeSeq is a recently proposed algorithm to find
trajectory differentially expressed genes. It works by smoothing the
gene expression along the trajectory by fitting a smoother using
generalized additive models (GAMs), and testing whether certain
coefficients are statistically different between points in the
trajectory.
BiocParallel::register(BiocParallel::MulticoreParam())The fitting of GAMs can take quite a while, so for demonstration purposes we first do a very stringent filtering of the genes.
IMPORTANT: In an ideal experiment, you would use all the genes, or at least those defined as being variable.
sel_cells <- split(colnames(obj@assays$RNA@data), obj$clusters_use)
sel_cells <- unlist(lapply(sel_cells, function(x) {
set.seed(1)
return(sample(x, 20))
}))
gv <- as.data.frame(na.omit(scran::modelGeneVar(obj@assays$RNA@data[, sel_cells])))
gv <- gv[order(gv$bio, decreasing = T), ]
sel_genes <- sort(rownames(gv)[1:500])Fitting the model:
sceGAM <- fitGAM(counts = drop0(obj@assays$RNA@data[sel_genes, sel_cells]), pseudotime = pseudotime[sel_cells,
], cellWeights = cellWeights[sel_cells, ], nknots = 5, verbose = T, parallel = T,
sce = TRUE, BPPARAM = BiocParallel::MulticoreParam())##
|
| | 0%
|
|== | 2%
|
|==== | 5%
|
|===== | 8%
|
|======= | 10%
|
|========= | 12%
|
|========== | 15%
|
|============ | 18%
|
|============== | 20%
|
|================ | 22%
|
|================== | 25%
|
|=================== | 28%
|
|===================== | 30%
|
|======================= | 32%
|
|======================== | 35%
|
|========================== | 38%
|
|============================ | 40%
|
|============================== | 42%
|
|================================ | 45%
|
|================================= | 48%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 55%
|
|======================================== | 58%
|
|========================================== | 60%
|
|============================================ | 62%
|
|============================================== | 65%
|
|=============================================== | 68%
|
|================================================= | 70%
|
|=================================================== | 72%
|
|==================================================== | 75%
|
|====================================================== | 78%
|
|======================================================== | 80%
|
|========================================================== | 82%
|
|============================================================ | 85%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|================================================================= | 92%
|
|================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
plotGeneCount(curves, clusters = obj$clusters_use, models = sceGAM)lineages## class: SlingshotDataSet
##
## Samples Dimensions
## 5828 2
##
## lineages: 7
## Lineage1: 34 18 36 33 55 59 44 60 58 29 8 43 47 49
## Lineage2: 34 18 11 15 46 9 1 2 5 13 28 17
## Lineage3: 34 18 11 15 35 7 32 6 54 25
## Lineage4: 34 18 11 15 35 7 32 6 27
## Lineage5: 34 18 36 21 12 20 16
## Lineage6: 34 18 36 33 55 38 53
## Lineage7: 34 18 36 26
##
## curves: 0
lc <- sapply(lineages@lineages, function(x) {
rev(x)[1]
})
names(lc) <- gsub("Lineage", "L", names(lc))
mypar()
plot(obj@reductions$umap@cell.embeddings, col = pal[obj$clusters_use], pch = 16)
lines(curves, lwd = 2, col = "black")
points(centroids2d[lc, ], col = "black", pch = 16, cex = 4)
text(centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white")We can first look at general trends of gene expression across pseudotime.
res <- na.omit(associationTest(sceGAM, contrastType = "consecutive"))
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = T), ]
res[1:10, ]We can plot their expression:
mypar(4, 4, mar = c(0.1, 0.1, 2, 1))
plot(obj@reductions$umap@cell.embeddings, col = pal[obj$clusters_use], cex = 0.5,
pch = 16, axes = F, xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = T)
text(centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white", xpd = T)
vars <- rownames(res[1:15, ])
vars <- na.omit(vars[vars != "NA"])
for (i in vars) {
x <- drop0(obj@assays$RNA@data)[i, ]
x <- (x - min(x))/(max(x) - min(x))
o <- order(x)
plot(obj@reductions$umap@cell.embeddings[o, ], main = paste0(i), pch = 16, cex = 0.5,
axes = F, xlab = "", ylab = "", col = colorRampPalette(c("lightgray", "grey60",
"navy"))(99)[x[o] * 98 + 1])
}We can define custom pseudotime values of interest if we’re interested in genes that change between particular point in pseudotime. By default, we can look at differences between start and end:
res <- na.omit(startVsEndTest(sceGAM, pseudotimeValues = c(0, 1)))
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = T), ]
res[1:10, 1:6]You can see now that there are several more columns, one for each lineage. This table represents the differential expression within each lineage, to identify which genes go up or down. Let’s check lineage 1:
# Get the top UP and Down regulated in lineage 1
res_lin1 <- sort(setNames(res$logFClineage1, rownames(res)))
vars <- names(c(rev(res_lin1)[1:7], res_lin1[1:8]))
vars <- na.omit(vars[vars != "NA"])
mypar(4, 4, mar = c(0.1, 0.1, 2, 1))
plot(obj@reductions$umap@cell.embeddings, col = pal[obj$clusters_use], cex = 0.5,
pch = 16, axes = F, xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = T)
text(centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white", xpd = T)
for (i in vars) {
x <- drop0(obj@assays$RNA@data)[i, ]
x <- (x - min(x))/(max(x) - min(x))
o <- order(x)
plot(obj@reductions$umap@cell.embeddings[o, ], main = paste0(i), pch = 16, cex = 0.5,
axes = F, xlab = "", ylab = "", col = colorRampPalette(c("lightgray", "grey60",
"navy"))(99)[x[o] * 98 + 1])
}More interesting are genes that are different between two branches. We may have seen some of these genes already pop up in previous analyses of pseudotime. There are several ways to define “different between branches”, and each have their own functions:
diffEndTestearlyDETestpatternTestres <- na.omit(diffEndTest(sceGAM))
res <- res[res$pvalue < 0.001, ]
res <- res[res$waldStat > mean(res$waldStat), ]
res <- res[order(res$waldStat, decreasing = T), ]
res[1:10, ]You can see now that there are even more columns, one for the pair-wise comparison between each lineage. Let’s check lineage 1 vs lineage 2:
# Get the top UP and Down regulated in lineage 1 vs 2
res_lin1_2 <- sort(setNames(res$logFC1_2, rownames(res)))
vars <- names(c(rev(res_lin1_2)[1:7], res_lin1_2[1:8]))
vars <- na.omit(vars[vars != "NA"])
mypar(4, 4, mar = c(0.1, 0.1, 2, 1))
plot(obj@reductions$umap@cell.embeddings, col = pal[obj$clusters_use], cex = 0.5,
pch = 16, axes = F, xlab = "", ylab = "")
lines(curves, lwd = 2, col = "black")
points(centroids2d[lc, ], col = "black", pch = 15, cex = 3, xpd = T)
text(centroids2d[lc, ], labels = names(lc), cex = 1, font = 2, col = "white", xpd = T)
for (i in vars) {
x <- drop0(obj@assays$RNA@data)[i, ]
x <- (x - min(x))/(max(x) - min(x))
o <- order(x)
plot(obj@reductions$umap@cell.embeddings[o, ], main = paste0(i), pch = 16, cex = 0.5,
axes = F, xlab = "", ylab = "", col = colorRampPalette(c("lightgray", "grey60",
"navy"))(99)[x[o] * 98 + 1])
}Check out this vignette for a more in-depth overview of tradeSeq and many other differential expression tests.
Cannoodt, Robrecht, Wouter Saelens, and Yvan Saeys. 2016. “Computational Methods for Trajectory Inference from Single-Cell Transcriptomics.” European Journal of Immunology 46 (11): 2496–2506. doi.
Saelens, Wouter, Robrecht Cannoodt, Helena Todorov, and Yvan Saeys. 2019. “A Comparison of Single-Cell Trajectory Inference Methods.” Nature Biotechnology 37 (5): 547–54. doi.
Before computing differential gene expression, sometimes it is a good idea to make sure our dataset is somewhat homogeneous (without very strong batch effects). In this dataset, we actually used data from 4 different technologies (Drop-seq, SmartSeq2 and 10X) and therefore massive differences in read counts can be observed:
# SEURAT
VlnPlot(obj, features = "nUMI", group.by = "batches")
# SCRAN
plotColData(sce, y = "nUMI", x = "batches", colour_by = "batches")Since we are not interested in the effects of the batches in this example, but only the differentiation paths for each cell type. We can use the integrated space of harmony embedding (where we removed batch effects). Since the harmony (same applies to MNN, SCANORAMA, LIGER ) is a corrected version of PCA, we can multiply the harmony embedding with PCA loadings to generate batch-corrected “pseudo counts”. Note that we can only reconstruct data from the highly variable genes that were used to compute PCA and HARMONY.
# Get the gene means and standard deviation
library(sparseMatrixStats)
genes <- rownames(PCA_loadings)
gene_means <- rowMeans2(filt_NORM_COUNTS[genes, ])
gene_sd <- sqrt(rowVars(filt_NORM_COUNTS[genes, ]))
# Project normalized gene counts
CORRECTED_NORMCOUNTS <- t(filt_HARMONY %*% t(PCA_loadings)) * gene_sd + gene_means -
0.02
CORRECTED_NORMCOUNTS <- Matrix(round(CORRECTED_NORMCOUNTS, 3), sparse = T)
CORRECTED_NORMCOUNTS@x[CORRECTED_NORMCOUNTS@x < 0] <- 0
CORRECTED_NORMCOUNTS <- drop0(CORRECTED_NORMCOUNTS)
# Transform the normalized data back to raw counts (used for differential
# expression)
CORRECTED_COUNTS <- round((expm1(CORRECTED_NORMCOUNTS)) * 1000)Let’s compare how the normalized data compares to the batch-corrected one.
mypar(3, 3)
plot(obj@reductions$umap@cell.embeddings, type = "n")
draw_graph(layout = obj@reductions$umap@cell.embeddings, graph = filt_KNN)
points(obj@reductions$umap@cell.embeddings, col = pal[filt_clustering], pch = 16)
text(centroids2d[, 1], centroids2d[, 2], labels = rownames(centroids2d), cex = 0.8,
font = 2)
vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech")
for (i in vars) {
plot(filt_NORM_COUNTS[i, ], CORRECTED_NORMCOUNTS[i, ], main = i, pch = 16, cex = 0.4)
rr <- c(diff(range(filt_NORM_COUNTS[i, ]))/50, (range(CORRECTED_NORMCOUNTS[i,
])))
polygon(c(-rr[1], -rr[1], rr[1], rr[1]), c(rr[3], rr[2], rr[2], rr[3]), border = "red")
text(rr[1], max(CORRECTED_NORMCOUNTS[i, ]), " < Imputed\n counts", adj = c(0,
1), col = "red", font = 2)
}IMPORTANT: Please note in the graphs above that there is a significant amount of imputation (i.e., we artificially add counts to certain cells where we’d expect to see ). Please keep this in mind and use these matrices with caution in downstream analysis!
Let’s also take a closer inspection on the UMAPs:
mypar(4, 5, mar = c(0.1, 0.1, 2, 1))
vars <- c("Cd34", "Ms4a1", "Cd3e", "Ltf", "Cst3", "Mcpt8", "Alas2", "Siglech", "C1qc")
for (j in c("filt_NORM_COUNTS", "CORRECTED_NORMCOUNTS")) {
plot(obj@reductions$umap@cell.embeddings, type = "n", axes = F, xlab = "", ylab = "",
main = j)
draw_graph(layout = obj@reductions$umap@cell.embeddings, graph = filt_KNN)
points(obj@reductions$umap@cell.embeddings, col = pal[obj$clusters_use], pch = 16)
text(centroids2d, labels = rownames(centroids2d), cex = 0.8, font = 2)
for (i in vars) {
x <- get(j)[i, ]
x <- x - min(x)/(max(x) - min(x))
o <- order(x)
plot(obj@reductions$umap@cell.embeddings[o, ], main = paste0(i), pch = 16,
cex = 0.4, axes = F, xlab = "", ylab = "", col = colorRampPalette(c("lightgray",
"blue"))(99)[x[o] * 98 + 1])
}
}